notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from collections import Counter

from IPython.display import HTML, display
import statsmodels.api as sm









    



/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [2]:

    
sns.set_context("poster")
sns.set_style("ticks")



In [3]:

    
TOPIC_MAPPING={
    "GunControl": "Gun Control",
    "Privacy": "Privacy",
    "Vaccine": "Vaccine",
    "ChildEducation": "Child Education",
    "SkinDamage": "Skin Damage",
    "SeatBelt": "Seat Belt"
}
topic_order=["Gun Control", "Privacy", "Vaccine",
             "Child Education", "Skin Damage", "Seat Belt"]
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").rename(columns={
        #u'is_controvertial': u'is_controversial'
    }).assign(
    topic_name=lambda x: x.topic_name.apply(lambda k: TOPIC_MAPPING[k.split('/')[0]]),
)
NON_STATES = set(["UNK", "USA", "AS", "DC", "GU",
              "MP", "PR", "VI"])


STATE_POPULATIONS="""4863300.00	AL
741894.00	AK
6931071.00	AZ
2988248.00	AR
39250017.00	CA
5540545.00	CO
3576452.00	CT
952065.00	DE
681170.00	DC
20612439.00	FL
10310371.00	GA
1428557.00	HI
1683140.00	ID
12801539.00	IL
6633053.00	IN
3134693.00	IA
2907289.00	KS
4436974.00	KY
4681666.00	LA
1331479.00	ME
6016447.00	MD
6811779.00	MA
9928300.00	MI
5519952.00	MN
2988726.00	MS
6093000.00	MO
1042520.00	MT
1907116.00	NE
2940058.00	NV
1334795.00	NH
8944469.00	NJ
2081015.00	NM
19745289.00	NY
10146788.00	NC
757952.00	ND
11614373.00	OH
3923561.00	OK
4093465.00	OR
12784227.00	PA
1056426.00	RI
4961119.00	SC
865454.00	SD
6651194.00	TN
27862596.00	TX
3051217.00	UT
624594.00	VT
8411808.00	VA
7288000.00	WA
1831102.00	WV
5778708.00	WI
585501.00	WY
""".splitlines()

STATE_POPULATIONS = {k:float(v) for v,k in map(lambda x: x.split('\t'), STATE_POPULATIONS)}


STATE_NAMES_ABBR="""AK	Alaska
AL	Alabama
AR	Arkansas
AZ	Arizona
CA	California
CO	Colorado
CT	Connecticut
DC	Dist. of Col.
DE	Delaware
FL	Florida
GA	Georgia
GU	Guam
HI	Hawaii
IA	Iowa
ID	Idaho
IL	Illinois
IN	Indiana
KS	Kansas
KY	Kentucky
LA	Louisiana
MA	Massachusetts
MD	Maryland
ME	Maine
MI	Michigan
MN	Minnesota
MO	Missouri
MS	Mississippi
MT	Montana
NC	North Carolina
ND	North Dakota
NE	Nebraska
NH	New Hampshire
NJ	New Jersey
NM	New Mexico
NV	Nevada
NY	New York
OH	Ohio
OK	Oklahoma
OR	Oregon
PA	Pennsylvania
PR	Puerto Rico
RI	Rhode Island
SC	South Carolina
SD	South Dakota
TN	Tennessee
TX	Texas
UT	Utah
VA	Virginia
VI	Virgin Islands
VT	Vermont
WA	Washington
WI	Wisconsin
WV	West Virginia
WY	Wyoming""".splitlines()
STATE_NAMES_ABBR = {
    k: v for v,k in map(lambda x: x.split("\t"), STATE_NAMES_ABBR)
}

## Source: http://www.presidency.ucsb.edu/showelection.php?year=2016

STATE_VOTES="""STATE	TOTAL VOTES	Hillary Votes	%	EV	Trump Votes	%	EV	Other Votes	%	EV
Alabama	2,123,372	729,547	34.4%	 	1,318,255	62.1%	9	44,467	2.1%	 
Alaska	318,608	116,454	36.6%	 	163,387	51.3%	3	18,725	5.9%	 
Arizona	2,573,165	1,161,167	45.1%	 	1,252,401	48.7%	11	106,327	4.1%	 
Arkansas	1,130,635	380,494	33.7%	 	684,872	60.6%	6	29,829	2.6%	 
California	14,181,595	8,753,788	61.7%	55	4,483,810	31.6%	 	478,500	3.4%	 
Colorado	2,780,220	1,338,870	48.2%	9	1,202,484	43.3%	 	144,121	5.2%	 
Connecticut	1,644,920	897,572	54.6%	7	673,215	40.9%	 	48,676	3.0%	 
Delaware	441,590	235,603	53.4%	3	185,127	41.9%	 	14,757	3.3%	 
Dist. of Col.	311,268	282,830	90.9%	3	12,723	4.1%	 	4,906	1.6%	 
Florida	9,420,039	4,504,975	47.8%	 	4,617,886	49.0%	29	207,043	2.2%	 
Georgia	4,092,373	1,877,963	45.9%	 	2,089,104	51.0%	16	125,306	3.1%	 
Hawaii	428,937	266,891	62.2%	3*	128,847	30.0%	 	15,954	3.7%	 
Idaho	690,255	189,765	27.5%	 	409,055	59.3%	4	28,331	4.1%	 
Illinois	5,536,424	3,090,729	55.8%	20	2,146,015	38.8%	 	209,596	3.8%	 
Indiana	2,734,958	1,033,126	37.8%	 	1,557,286	56.9%	11	133,993	4.9%	 
Iowa	1,566,031	653,669	41.7%	 	800,983	51.1%	6	59,186	3.8%	 
Kansas	1,184,402	427,005	36.1%	 	671,018	56.7%	6	55,406	4.7%	 
Kentucky	1,924,149	628,854	32.7%	 	1,202,971	62.5%	8	53,752	2.8%	 
Louisiana	2,029,032	780,154	38.4%	 	1,178,638	58.1%	8	37,978	1.9%	 
Maine	747,927	357,735	47.8%	3	335,593	44.9%	1	38,105	5.1%	 
Maryland	2,781,446	1,677,928	60.3%	10	943,169	33.9%	 	79,605	2.9%	 
Massachusetts	3,325,046	1,995,196	60.0%	11	1,090,893	32.8%	 	138,018	4.2%	 
Michigan	4,799,284	2,268,839	47.3%	 	2,279,543	47.5%	16	172,136	3.6%	 
Minnesota	2,944,813	1,367,716	46.4%	10	1,322,951	44.9%	 	112,972	3.8%	 
Mississippi	1,209,357	485,131	40.1%	 	700,714	57.9%	6	14,435	1.2%	 
Missouri	2,808,605	1,071,068	38.1%	 	1,594,511	56.8%	10	97,359	3.5%	 
Montana	494,526	177,709	35.9%	 	279,240	56.5%	3	28,037	5.7%	 
Nebraska	844,227	284,494	33.7%	 	495,961	58.7%	2	38,946	4.6%	 
Nevada	1,125,385	539,260	47.9%	6	512,058	45.5%	 	37,384	3.3%	 
New Hampshire	744,296	348,526	46.8%	4	345,790	46.5%	 	30,777	4.1%	 
New Jersey	3,874,046	2,148,278	55.5%	14	1,601,933	41.4%	 	72,477	1.9%	 
New Mexico	798,318	385,234	48.3%	5	319,666	40.0%	 	74,541	9.3%	 
New York	7,721,453	4,556,124	59.0%	29	2,819,534	36.5%	 	176,598	2.3%	 
North Carolina	4,741,564	2,189,316	46.2%	 	2,362,631	49.8%	15	130,126	2.7%	 
North Dakota	344,360	93,758	27.2%	 	216,794	63.0%	3	21,434	6.2%	 
Ohio	5,496,487	2,394,164	43.6%	 	2,841,005	51.7%	18	174,498	3.2%	 
Oklahoma	1,452,992	420,375	28.9%	 	949,136	65.3%	7	83,481	5.7%	 
Oregon	2,001,336	1,002,106	50.1%	7	782,403	39.1%	 	94,231	4.7%	 
Pennsylvania	6,115,402	2,926,441	47.9%	 	2,970,733	48.6%	20	146,715	2.4%	 
Rhode Island	464,144	252,525	54.4%	4	180,543	38.9%	 	14,746	3.2%	 
South Carolina	2,103,027	855,373	40.7%	 	1,155,389	54.9%	9	49,204	2.3%	 
South Dakota	370,093	117,458	31.7%	 	227,721	61.5%	3	20,850	5.6%	 
Tennessee	2,508,027	870,695	34.7%	 	1,522,925	60.7%	11	70,397	2.8%	 
Texas	8,969,226	3,877,868	43.2%	 	4,685,047	52.2%	36*	283,492	3.2%	 
Utah	1,131,430	310,676	27.5%	 	515,231	45.5%	6	39,608	3.5%	 
Vermont	315,067	178,573	56.7%	3	95,369	30.3%	 	10,078	3.2%	 
Virginia	3,982,752	1,981,473	49.8%	13	1,769,443	44.4%	 	118,274	3.0%	 
Washington	3,209,214	1,742,718	54.3%	8*	1,221,747	38.1%	 	160,879	5.0%	 
West Virginia	713,051	188,794	26.5%	 	489,371	68.6%	5	23,004	3.2%	 
Wisconsin	2,976,150	1,382,536	46.5%	 	1,405,284	47.2%	10	106,674	3.6%	 
Wyoming	255,849	55,973	21.9%	 	174,419	68.2%	3	13,287	5.2%""".splitlines()
STATE_VOTES = list(map(lambda x: x.replace(",", "").replace("%", "").split("\t"), STATE_VOTES))

STATE_COLORS={
    STATE_NAMES_ABBR[x[0]]: "b" if int(x[2]) > int(x[5]) else "r"
    for x in STATE_VOTES[1:]
}

print(STATE_COLORS)



CHOROGRID_STATES_FILE='/content/Code/smishra8/chorogrid/chorogrid/databases/usa_states.csv'


def logit_transform(p):
    eps = 1e-8
    return np.log((p + eps)/(1-p + eps))









    



{'WA': 'b', 'DE': 'b', 'DC': 'b', 'WI': 'r', 'WV': 'r', 'HI': 'b', 'FL': 'r', 'WY': 'r', 'NH': 'b', 'NJ': 'b', 'NM': 'b', 'TX': 'r', 'LA': 'r', 'NC': 'r', 'ND': 'r', 'NE': 'r', 'TN': 'r', 'NY': 'b', 'PA': 'r', 'CA': 'b', 'NV': 'b', 'VA': 'b', 'CO': 'b', 'AK': 'r', 'AL': 'r', 'AR': 'r', 'VT': 'b', 'IL': 'b', 'GA': 'r', 'IN': 'r', 'IA': 'r', 'OK': 'r', 'AZ': 'r', 'ID': 'r', 'CT': 'b', 'ME': 'b', 'MD': 'b', 'MA': 'b', 'OH': 'r', 'UT': 'r', 'MO': 'r', 'MN': 'b', 'MI': 'r', 'RI': 'b', 'KS': 'r', 'MT': 'r', 'MS': 'r', 'SC': 'r', 'KY': 'r', 'OR': 'b', 'SD': 'r'}



In [4]:

    
STATE_POPULATIONS["AZ"]









    Out[4]:





6931071.0



In [5]:

    
df.columns









    Out[5]:





Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')



In [6]:

    
df.CATS.fillna(0).apply(
    lambda x: Counter(['UNK']) 
    if x == 0 
    else Counter(x)
).apply(lambda x: len(x)).describe()









    Out[6]:





count    246869.000000
mean          1.139163
std           0.356983
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           5.000000
Name: CATS, dtype: float64



In [7]:

    
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda x: Counter(['NONE']) 
    if x == 0 
    else Counter(x)
)
df[df.CATS_Counter.apply(lambda x: len(x)) == 2]["CATS_Counter"].head()









    Out[7]:





23     {u'socialmedia': 1, u'videos': 1}
29    {u'twitter': 1, u'socialmedia': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
54    {u'twitter': 1, u'socialmedia': 1}
Name: CATS_Counter, dtype: object



In [8]:

    
df.u_state.isnull().value_counts().to_frame().assign(props=lambda x: x["u_state"]/x["u_state"].sum())



In [9]:

    
(~df.u_state.isnull() & (df.u_state !="USA")).value_counts().to_frame().assign(props=lambda x: x["u_state"]/x["u_state"].sum())

Proportion of urls controlling for party and population



In [10]:

    
url_types = ["fakenews", "news",]
for ui, url_type in enumerate(url_types):
        print ui, url_type
        for i, topic in enumerate(topic_order[:3]):
            print i, topic
            df_t = (df[
                        (df.u_state != "USA")
                      & (~df.u_state.isnull())
                      & (df.t_n_urls > 0)
                      & (df.topic_name == topic)
                    ]
                    .assign(**{
                        "url_type": lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0) > 0).astype("int"),
                        "population": lambda x: x.u_state.map(STATE_POPULATIONS, 1),
                        "state_color": lambda x: x.u_state.map(STATE_COLORS, "k")
                    })).query("state_color!='k'")
            model = sm.Logit.from_formula("url_type ~ np.log10(population) + state_color", data=df_t).fit()
            display(model.summary2())









    



0 fakenews
0 Gun Control
Optimization terminated successfully.
         Current function value: 0.386068
         Iterations 6






    






        Model:               Logit       Pseudo R-squared:     0.006  


  Dependent Variable:      url_type            AIC:          8527.2848


         Date:         2018-01-25 21:09        BIC:          8549.2115


   No. Observations:         11036        Log-Likelihood:     -4260.6 


       Df Model:               2             LL-Null:         -4287.5 


     Df Residuals:           11033         LLR p-value:     2.1182e-12


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         6.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.2820   0.4656   -4.9009  0.0000  -3.1946  -1.3694


  state_color[T.r]      0.4146    0.0567   7.3075   0.0000  0.3034   0.5258 


  np.log10(population)  0.0262    0.0660   0.3974   0.6911  -0.1031  0.1556 








    



1 Privacy
Optimization terminated successfully.
         Current function value: 0.214723
         Iterations 7






    






        Model:               Logit       Pseudo R-squared:    0.001  


  Dependent Variable:      url_type            AIC:         9272.1480


         Date:         2018-01-25 21:09        BIC:         9296.0862


   No. Observations:         21577        Log-Likelihood:    -4633.1 


       Df Model:               2             LL-Null:        -4636.0 


     Df Residuals:           21574         LLR p-value:     0.053933 


      Converged:            1.0000            Scale:         1.0000  


    No. Iterations:         7.0000                                  




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.2916   0.4109   -5.5778  0.0000  -3.0969  -1.4864


  state_color[T.r]      0.1250    0.0607   2.0604   0.0394  0.0061   0.2439 


  np.log10(population)  -0.0846   0.0591   -1.4307  0.1525  -0.2005  0.0313 








    



2 Vaccine
Optimization terminated successfully.
         Current function value: 0.312787
         Iterations 6






    






        Model:               Logit       Pseudo R-squared:     0.002  


  Dependent Variable:      url_type            AIC:          9300.7812


         Date:         2018-01-25 21:09        BIC:          9323.6000


   No. Observations:         14858        Log-Likelihood:     -4647.4 


       Df Model:               2             LL-Null:         -4656.8 


     Df Residuals:           14855         LLR p-value:     7.8778e-05


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         6.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.5120   0.4729   -5.3114  0.0000  -3.4390  -1.5851


  state_color[T.r]      0.2491    0.0573   4.3469   0.0000  0.1368   0.3613 


  np.log10(population)  0.0214    0.0660   0.3251   0.7451  -0.1078  0.1507 








    



1 news
0 Gun Control
Optimization terminated successfully.
         Current function value: 0.559483
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.009  


  Dependent Variable:      url_type            AIC:         12354.9002


         Date:         2018-01-25 21:09        BIC:         12376.8269


   No. Observations:         11036        Log-Likelihood:     -6174.5 


       Df Model:               2             LL-Null:         -6230.0 


     Df Residuals:           11033         LLR p-value:     7.6243e-25


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.      z      P>|z|  [0.025   0.975] 


  Intercept             -0.2492   0.3413    -0.7301  0.4654  -0.9181  0.4198 


  state_color[T.r]      -0.4638   0.0449   -10.3253  0.0000  -0.5519  -0.3758


  np.log10(population)  -0.0911   0.0485    -1.8780  0.0604  -0.1861  0.0040 








    



1 Privacy
Optimization terminated successfully.
         Current function value: 0.571952
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.002  


  Dependent Variable:      url_type            AIC:         24687.9996


         Date:         2018-01-25 21:09        BIC:         24711.9377


   No. Observations:         21577        Log-Likelihood:     -12341. 


       Df Model:               2             LL-Null:         -12361. 


     Df Residuals:           21574         LLR p-value:     2.9467e-09


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -1.0333   0.2144   -4.8190  0.0000  -1.4536  -0.6131


  state_color[T.r]      -0.2024   0.0325   -6.2358  0.0000  -0.2660  -0.1388


  np.log10(population)  0.0088    0.0307   0.2865   0.7745  -0.0515  0.0691 








    



2 Vaccine
Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.004  


  Dependent Variable:      url_type            AIC:         14580.9189


         Date:         2018-01-25 21:09        BIC:         14603.7378


   No. Observations:         14858        Log-Likelihood:     -7287.5 


       Df Model:               2             LL-Null:         -7319.0 


     Df Residuals:           14855         LLR p-value:     1.9532e-14


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -0.1177   0.3301   -0.3565  0.7215  -0.7646  0.5292 


  state_color[T.r]      -0.3293   0.0441   -7.4693  0.0000  -0.4157  -0.2429


  np.log10(population)  -0.1670   0.0462   -3.6135  0.0003  -0.2575  -0.0764



In [11]:

    
model = sm.Logit.from_formula("url_type ~ np.log10(population) + state_color", data=df_t).fit()
model.summary2()









    



Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5






    Out[11]:






        Model:               Logit       Pseudo R-squared:     0.004  


  Dependent Variable:      url_type            AIC:         14580.9189


         Date:         2018-01-25 21:09        BIC:         14603.7378


   No. Observations:         14858        Log-Likelihood:     -7287.5 


       Df Model:               2             LL-Null:         -7319.0 


     Df Residuals:           14855         LLR p-value:     1.9532e-14


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -0.1177   0.3301   -0.3565  0.7215  -0.7646  0.5292 


  state_color[T.r]      -0.3293   0.0441   -7.4693  0.0000  -0.4157  -0.2429


  np.log10(population)  -0.1670   0.0462   -3.6135  0.0003  -0.2575  -0.0764



In [12]:

    
def significance_stars(p):
    """
    ns P > 0.05
    * P ≤ 0.05
    ** P ≤ 0.01
    *** P ≤ 0.001
    **** P ≤ 0.0001 (For the last two choices only)
    """
    if p > 0.05:
        return ""
    if p >= 0.01:
        return "*"
    if p > 0.001:
        return "**"
    if p > 0.0001:
        return "***"
    return "****"



In [13]:

    
print "{} $(\\beta_{{Rep}}={:.3f}^{{{}}})$".format(
    topic,
    model.summary2().tables[1].loc["state_color[T.r]", "Coef."],
    significance_stars(model.summary2().tables[1].loc["state_color[T.r]", "P>|z|"]))









    



Vaccine $(\beta_{Rep}=-0.329^{****})$



In [14]:

    
def plot_by_topic(df, nstates=10):
    url_types = ["fakenews", "news",]
    ncols = len(topic_order[:3])
    nrows = len(url_types)
    w, h = 6, 6
    fig, ax = plt.subplots(
        nrows, ncols,
        sharex=True,
        #sharey="col",
        figsize=(ncols*w,nrows*h)
    )
    for ui, url_type in enumerate(url_types):
        print ui, url_type
        for i, topic in enumerate(topic_order[:3]):
            print i, topic
            df_t = (
                df[
                    (df.u_state != "USA") 
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                ]
                .assign(**{
                    url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
                })
                [["u_state", url_type]].groupby("u_state")[url_type]
                .agg([np.sum, len, np.mean])
                .reset_index()
                .rename(columns={
                    "sum": "success",
                    "len": "total",
                    "mean": "proportion"
                })
                .assign(
                    failure=lambda x: x["total"] - x["success"],
                    population=lambda x: x.u_state.map(STATE_POPULATIONS, 1),
                    std_err=lambda x: np.sqrt(x["proportion"]*(1-x["proportion"])/x["total"])
                )
                .dropna()
                .assign(
                    log_odds=lambda x: logit_transform(x["proportion"]),
                    #population=lambda x: np.log10(x["population"])
                )
            )
            display(df_t.sort_values("proportion", ascending=False).head(10))
            X, Y, yerr = df_t.population.values, df_t.proportion.values, df_t.std_err.values
            """ax[ui, i].errorbar(
                X, Y, yerr=yerr,
                linestyle="none", elinewidth=1,
                color="k", marker="o", ms=5,
                alpha=0.5
            )"""
            ax[ui, i].plot(
                X, Y,
                linestyle="none",
                color="k", marker="o", ms=5,
                alpha=0.5
            )
            overall_p = df_t.success.sum()*1./df_t.total.sum()
            overall_err=np.sqrt(overall_p*(1-overall_p)/df_t.total.sum())
            #ax[i].axhline(y=logit_transform(overall_p), ls="--", color="0.5")
            ax[ui, i].axhline(y=overall_p, ls="--", lw=1, color="0.5")
            ax[ui, i].axhspan(
                overall_p-overall_err, overall_p+overall_err,
                color="0.9", alpha=0.3
            )
            #ax[ui, i].axhline(y=overall_p, ls="--", lw=1, color="0.5")
            #ax[i, ui].set_xlabel("$log_{10}(population)$")
            ax[ui, i].set_xscale("log")
            ax[ui, i].set_ylabel("$p(url={})$".format(url_type))
            for j, state in enumerate(df_t.u_state.values):
                ax[ui, i].text(
                    X[j], Y[j], state,
                    fontsize=14, color=STATE_COLORS[state], alpha=0.7
                )
            ## Fit model and plot
            df_t = (df[
                        (df.u_state != "USA")
                      & (~df.u_state.isnull())
                      & (df.t_n_urls > 0)
                      & (df.topic_name == topic)
                    ]
                    .assign(**{
                        "url_type": lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0) > 0).astype("int"),
                        "population": lambda x: x.u_state.map(STATE_POPULATIONS, 1),
                        "state_color": lambda x: x.u_state.map(STATE_COLORS, "k")
                    })).query("state_color!='k'")
            model = sm.Logit.from_formula("url_type ~ np.log10(population) + state_color", data=df_t).fit()
            display(model.summary2())   
            for sc in ["r", "b"]:
                df_tt = pd.DataFrame({
                    "population": 10**np.arange(5.5, 8.1, 0.1)
                }).assign(state_color=sc)
                x = df_tt.population
                y = model.predict(df_tt)
                ax[ui, i].plot(x, y, color=sc, linestyle="--", lw=2, alpha=0.7)
            title = "{} $(\\beta_{{Rep}}={:.3f}^{{{}}})$".format(
                topic,
                model.summary2().tables[1].loc["state_color[T.r]", "Coef."],
                significance_stars(model.summary2().tables[1].loc["state_color[T.r]", "P>|z|"])
            )
            ax[ui, i].set_title(title)
        #fig.suptitle(url_type.upper(), fontsize=16)
        #plt.subplots_adjust(hspace=0.2, wspace=0.3)
    sns.despine(offset=10)
    fig.tight_layout()
    plt.savefig("Population Party Proportion.pdf", bbox_inches="tight")
nstates=None



In [15]:

    
plot_by_topic(df, nstates=nstates)









    



0 fakenews
0 Gun Control






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      43
      SD
      5
      10
      0.500000
      5
      865454.0
      0.158114
      0.000000
    
    
      0
      AK
      5
      14
      0.357143
      9
      741894.0
      0.128060
      -0.587787
    
    
      1
      AL
      32
      130
      0.246154
      98
      4863300.0
      0.037781
      -1.119232
    
    
      51
      WV
      4
      20
      0.200000
      16
      1831102.0
      0.089443
      -1.386294
    
    
      50
      WI
      41
      206
      0.199029
      165
      5778708.0
      0.027818
      -1.392373
    
    
      47
      VA
      61
      330
      0.184848
      269
      8411808.0
      0.021368
      -1.483837
    
    
      13
      IA
      12
      67
      0.179104
      55
      3134693.0
      0.046845
      -1.522426
    
    
      10
      FL
      126
      707
      0.178218
      581
      20612439.0
      0.014393
      -1.528469
    
    
      45
      TX
      167
      938
      0.178038
      771
      27862596.0
      0.012491
      -1.529695
    
    
      44
      TN
      29
      165
      0.175758
      136
      6651194.0
      0.029631
      -1.545359
    
  








    



Optimization terminated successfully.
         Current function value: 0.386068
         Iterations 6






    






        Model:               Logit       Pseudo R-squared:     0.006  


  Dependent Variable:      url_type            AIC:          8527.2848


         Date:         2018-01-25 21:09        BIC:          8549.2115


   No. Observations:         11036        Log-Likelihood:     -4260.6 


       Df Model:               2             LL-Null:         -4287.5 


     Df Residuals:           11033         LLR p-value:     2.1182e-12


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         6.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.2820   0.4656   -4.9009  0.0000  -3.1946  -1.3694


  state_color[T.r]      0.4146    0.0567   7.3075   0.0000  0.3034   0.5258 


  np.log10(population)  0.0262    0.0660   0.3974   0.6911  -0.1031  0.1556 








    



1 Privacy






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      0
      AK
      8
      41
      0.195122
      33
      741894.0
      0.061891
      -1.417066
    
    
      1
      AL
      33
      219
      0.150685
      186
      4863300.0
      0.024174
      -1.729239
    
    
      21
      ME
      9
      88
      0.102273
      79
      1331479.0
      0.032301
      -2.172223
    
    
      12
      IA
      12
      139
      0.086331
      127
      3134693.0
      0.023822
      -2.359280
    
    
      32
      NJ
      43
      500
      0.086000
      457
      8944469.0
      0.012538
      -2.363483
    
    
      27
      MT
      4
      47
      0.085106
      43
      1042520.0
      0.040702
      -2.374906
    
    
      50
      WI
      20
      257
      0.077821
      237
      5778708.0
      0.016711
      -2.472328
    
    
      51
      WV
      2
      26
      0.076923
      24
      1831102.0
      0.052259
      -2.484907
    
    
      23
      MN
      15
      200
      0.075000
      185
      5519952.0
      0.018625
      -2.512306
    
    
      9
      FL
      92
      1252
      0.073482
      1160
      20612439.0
      0.007374
      -2.534387
    
  








    



Optimization terminated successfully.
         Current function value: 0.214723
         Iterations 7






    






        Model:               Logit       Pseudo R-squared:    0.001  


  Dependent Variable:      url_type            AIC:         9272.1480


         Date:         2018-01-25 21:09        BIC:         9296.0862


   No. Observations:         21577        Log-Likelihood:    -4633.1 


       Df Model:               2             LL-Null:        -4636.0 


     Df Residuals:           21574         LLR p-value:     0.053933 


      Converged:            1.0000            Scale:         1.0000  


    No. Iterations:         7.0000                                  




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.2916   0.4109   -5.5778  0.0000  -3.0969  -1.4864


  state_color[T.r]      0.1250    0.0607   2.0604   0.0394  0.0061   0.2439 


  np.log10(population)  -0.0846   0.0591   -1.4307  0.1525  -0.2005  0.0313 








    



2 Vaccine






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      32
      NM
      14
      57
      0.245614
      43
      2081015.0
      0.057015
      -1.122143
    
    
      5
      CO
      51
      232
      0.219828
      181
      5540545.0
      0.027189
      -1.266671
    
    
      33
      NV
      38
      176
      0.215909
      138
      2940058.0
      0.031014
      -1.289667
    
    
      21
      ME
      14
      74
      0.189189
      60
      1331479.0
      0.045529
      -1.455287
    
    
      49
      WI
      37
      197
      0.187817
      160
      5778708.0
      0.027827
      -1.464256
    
    
      3
      AZ
      47
      265
      0.177358
      218
      6931071.0
      0.023464
      -1.534347
    
    
      0
      AK
      3
      17
      0.176471
      14
      741894.0
      0.092459
      -1.540445
    
    
      13
      ID
      7
      40
      0.175000
      33
      1683140.0
      0.060078
      -1.550597
    
    
      6
      CT
      15
      90
      0.166667
      75
      3576452.0
      0.039284
      -1.609438
    
    
      31
      NJ
      53
      336
      0.157738
      283
      8944469.0
      0.019885
      -1.675155
    
  








    



Optimization terminated successfully.
         Current function value: 0.312787
         Iterations 6






    






        Model:               Logit       Pseudo R-squared:     0.002  


  Dependent Variable:      url_type            AIC:          9300.7812


         Date:         2018-01-25 21:09        BIC:          9323.6000


   No. Observations:         14858        Log-Likelihood:     -4647.4 


       Df Model:               2             LL-Null:         -4656.8 


     Df Residuals:           14855         LLR p-value:     7.8778e-05


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         6.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -2.5120   0.4729   -5.3114  0.0000  -3.4390  -1.5851


  state_color[T.r]      0.2491    0.0573   4.3469   0.0000  0.1368   0.3613 


  np.log10(population)  0.0214    0.0660   0.3251   0.7451  -0.1078  0.1507 








    



1 news
0 Gun Control






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      8
      DC
      194
      429
      0.452214
      235
      681170.0
      0.024030
      -0.191727
    
    
      2
      AR
      24
      68
      0.352941
      44
      2988248.0
      0.057952
      -0.606136
    
    
      35
      NY
      394
      1154
      0.341421
      760
      19745289.0
      0.013959
      -0.656968
    
    
      29
      ND
      2
      6
      0.333333
      4
      757952.0
      0.192450
      -0.693147
    
    
      48
      VT
      7
      21
      0.333333
      14
      624594.0
      0.102869
      -0.693147
    
    
      15
      IL
      132
      421
      0.313539
      289
      12801539.0
      0.022611
      -0.783625
    
    
      12
      HI
      9
      29
      0.310345
      20
      1428557.0
      0.085909
      -0.798508
    
    
      49
      WA
      122
      394
      0.309645
      272
      7288000.0
      0.023293
      -0.801781
    
    
      6
      CO
      73
      239
      0.305439
      166
      5540545.0
      0.029793
      -0.821528
    
    
      0
      AK
      4
      14
      0.285714
      10
      741894.0
      0.120736
      -0.916291
    
  








    



Optimization terminated successfully.
         Current function value: 0.559483
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.009  


  Dependent Variable:      url_type            AIC:         12354.9002


         Date:         2018-01-25 21:09        BIC:         12376.8269


   No. Observations:         11036        Log-Likelihood:     -6174.5 


       Df Model:               2             LL-Null:         -6230.0 


     Df Residuals:           11033         LLR p-value:     7.6243e-25


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.      z      P>|z|  [0.025   0.975] 


  Intercept             -0.2492   0.3413    -0.7301  0.4654  -0.9181  0.4198 


  state_color[T.r]      -0.4638   0.0449   -10.3253  0.0000  -0.5519  -0.3758


  np.log10(population)  -0.0911   0.0485    -1.8780  0.0604  -0.1861  0.0040 








    



1 Privacy






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      8
      DE
      10
      22
      0.454545
      12
      952065.0
      0.106159
      -0.182322
    
    
      48
      VT
      16
      48
      0.333333
      32
      624594.0
      0.068041
      -0.693147
    
    
      28
      NC
      168
      508
      0.330709
      340
      10146788.0
      0.020874
      -0.704982
    
    
      24
      MO
      62
      189
      0.328042
      127
      6093000.0
      0.034151
      -0.717053
    
    
      11
      HI
      27
      84
      0.321429
      57
      1428557.0
      0.050957
      -0.747214
    
    
      7
      DC
      574
      1846
      0.310943
      1272
      681170.0
      0.010773
      -0.795716
    
    
      19
      MA
      184
      607
      0.303130
      423
      6811779.0
      0.018655
      -0.832436
    
    
      31
      NH
      31
      103
      0.300971
      72
      1334795.0
      0.045195
      -0.842679
    
    
      29
      ND
      3
      10
      0.300000
      7
      757952.0
      0.144914
      -0.847298
    
    
      33
      NM
      18
      61
      0.295082
      43
      2081015.0
      0.058395
      -0.870828
    
  








    



Optimization terminated successfully.
         Current function value: 0.571952
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.002  


  Dependent Variable:      url_type            AIC:         24687.9996


         Date:         2018-01-25 21:09        BIC:         24711.9377


   No. Observations:         21577        Log-Likelihood:     -12341. 


       Df Model:               2             LL-Null:         -12361. 


     Df Residuals:           21574         LLR p-value:     2.9467e-09


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -1.0333   0.2144   -4.8190  0.0000  -1.4536  -0.6131


  state_color[T.r]      -0.2024   0.0325   -6.2358  0.0000  -0.2660  -0.1388


  np.log10(population)  0.0088    0.0307   0.2865   0.7745  -0.0515  0.0691 








    



2 Vaccine






    







  
    
      
      u_state
      success
      total
      proportion
      failure
      population
      std_err
      log_odds
    
  
  
    
      42
      SD
      4
      10
      0.400000
      6
      865454.0
      0.154919
      -0.405465
    
    
      19
      MA
      154
      447
      0.344519
      293
      6811779.0
      0.022477
      -0.643220
    
    
      32
      NM
      17
      57
      0.298246
      40
      2081015.0
      0.060596
      -0.855666
    
    
      0
      AK
      5
      17
      0.294118
      12
      741894.0
      0.110510
      -0.875469
    
    
      20
      MD
      64
      230
      0.278261
      166
      6016447.0
      0.029550
      -0.953105
    
    
      12
      IA
      20
      72
      0.277778
      52
      3134693.0
      0.052786
      -0.955511
    
    
      7
      DC
      106
      385
      0.275325
      279
      681170.0
      0.022765
      -0.967773
    
    
      40
      RI
      11
      40
      0.275000
      29
      1056426.0
      0.070600
      -0.969401
    
    
      34
      NY
      456
      1660
      0.274699
      1204
      19745289.0
      0.010956
      -0.970912
    
    
      29
      NE
      10
      37
      0.270270
      27
      1907116.0
      0.073009
      -0.993252
    
  








    



Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5






    






        Model:               Logit       Pseudo R-squared:     0.004  


  Dependent Variable:      url_type            AIC:         14580.9189


         Date:         2018-01-25 21:09        BIC:         14603.7378


   No. Observations:         14858        Log-Likelihood:     -7287.5 


       Df Model:               2             LL-Null:         -7319.0 


     Df Residuals:           14855         LLR p-value:     1.9532e-14


      Converged:            1.0000            Scale:          1.0000  


    No. Iterations:         5.0000                                   




                        Coef.   Std.Err.     z      P>|z|  [0.025   0.975] 


  Intercept             -0.1177   0.3301   -0.3565  0.7215  -0.7646  0.5292 


  state_color[T.r]      -0.3293   0.0441   -7.4693  0.0000  -0.4157  -0.2429


  np.log10(population)  -0.1670   0.0462   -3.6135  0.0003  -0.2575  -0.0764








    



/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [16]:

    
pd.Series(STATE_COLORS.values()).value_counts()









    Out[16]:





r    30
b    21
dtype: int64



In [17]:

    
df[
    (df.u_state != "USA")
  & (~df.u_state.isnull()) 
  & (df.t_n_urls > 0)
  #& (df.topic_name == topic)
].assign(state_color=lambda x: x.u_state.map(STATE_COLORS)).groupby("state_color").t_n_urls.count()









    Out[17]:





state_color
b    38521
r    29255
Name: t_n_urls, dtype: int64



In [18]:

    
df.CATS_Counter.head()









    Out[18]:





0    {u'NONE': 1}
1    {u'NONE': 1}
2    {u'NONE': 1}
3    {u'NONE': 1}
4    {u'NONE': 1}
Name: CATS_Counter, dtype: object



In [19]:

    
df.assign(
    UNK_count=df.CATS_Counter.apply(lambda x: x.get("UNK", 0))
).groupby("is_controversial").UNK_count.agg([np.sum, np.mean])









    Out[19]:







  
    
      
      sum
      mean
    
    
      is_controversial
      
      
    
  
  
    
      False
      9799
      0.099780
    
    
      True
      21192
      0.142551



In [25]:

    
df_state_votes = pd.DataFrame(
    STATE_VOTES[1:],
    columns=[
        "State", "Total",
        "Hillary Votes", "Hillary Percent", "Hillary EV",
        "Trump Votes", "Trump Percent", "Trump EV",
        "Other Votes", "Other Percent", "Other EV",
    ]
).astype({
    "State": "str", "Total": "int",
        "Hillary Votes": "int", "Hillary Percent": "float", "Hillary EV": "str",
        "Trump Votes": "int", "Trump Percent": "float", "Trump EV": "str",
        "Other Votes": "int", "Other Percent": "float", "Other EV": "str",
}).assign(hill_trump_diff=lambda x: x["Hillary Percent"] - x["Trump Percent"])
df_state_votes.head()









    Out[25]:







  
    
      
      State
      Total
      Hillary Votes
      Hillary Percent
      Hillary EV
      Trump Votes
      Trump Percent
      Trump EV
      Other Votes
      Other Percent
      Other EV
      hill_trump_diff
    
  
  
    
      0
      Alabama
      2123372
      729547
      34.4
      
      1318255
      62.1
      9
      44467
      2.1
      
      -27.7
    
    
      1
      Alaska
      318608
      116454
      36.6
      
      163387
      51.3
      3
      18725
      5.9
      
      -14.7
    
    
      2
      Arizona
      2573165
      1161167
      45.1
      
      1252401
      48.7
      11
      106327
      4.1
      
      -3.6
    
    
      3
      Arkansas
      1130635
      380494
      33.7
      
      684872
      60.6
      6
      29829
      2.6
      
      -26.9
    
    
      4
      California
      14181595
      8753788
      61.7
      55
      4483810
      31.6
      
      478500
      3.4
      
      30.1



In [26]:

    
df_state_votes.sort_values("hill_trump_diff").head(10)









    Out[26]:







  
    
      
      State
      Total
      Hillary Votes
      Hillary Percent
      Hillary EV
      Trump Votes
      Trump Percent
      Trump EV
      Other Votes
      Other Percent
      Other EV
      hill_trump_diff
    
  
  
    
      50
      Wyoming
      255849
      55973
      21.9
      
      174419
      68.2
      3
      13287
      5.2
      None
      -46.3
    
    
      48
      West Virginia
      713051
      188794
      26.5
      
      489371
      68.6
      5
      23004
      3.2
      
      -42.1
    
    
      36
      Oklahoma
      1452992
      420375
      28.9
      
      949136
      65.3
      7
      83481
      5.7
      
      -36.4
    
    
      34
      North Dakota
      344360
      93758
      27.2
      
      216794
      63.0
      3
      21434
      6.2
      
      -35.8
    
    
      12
      Idaho
      690255
      189765
      27.5
      
      409055
      59.3
      4
      28331
      4.1
      
      -31.8
    
    
      41
      South Dakota
      370093
      117458
      31.7
      
      227721
      61.5
      3
      20850
      5.6
      
      -29.8
    
    
      17
      Kentucky
      1924149
      628854
      32.7
      
      1202971
      62.5
      8
      53752
      2.8
      
      -29.8
    
    
      0
      Alabama
      2123372
      729547
      34.4
      
      1318255
      62.1
      9
      44467
      2.1
      
      -27.7
    
    
      3
      Arkansas
      1130635
      380494
      33.7
      
      684872
      60.6
      6
      29829
      2.6
      
      -26.9
    
    
      42
      Tennessee
      2508027
      870695
      34.7
      
      1522925
      60.7
      11
      70397
      2.8
      
      -26.0



In [27]:

    
df_state_votes.sort_values("hill_trump_diff", ascending=False).head(10)









    Out[27]:







  
    
      
      State
      Total
      Hillary Votes
      Hillary Percent
      Hillary EV
      Trump Votes
      Trump Percent
      Trump EV
      Other Votes
      Other Percent
      Other EV
      hill_trump_diff
    
  
  
    
      8
      Dist. of Col.
      311268
      282830
      90.9
      3
      12723
      4.1
      
      4906
      1.6
      
      86.8
    
    
      11
      Hawaii
      428937
      266891
      62.2
      3*
      128847
      30.0
      
      15954
      3.7
      
      32.2
    
    
      4
      California
      14181595
      8753788
      61.7
      55
      4483810
      31.6
      
      478500
      3.4
      
      30.1
    
    
      21
      Massachusetts
      3325046
      1995196
      60.0
      11
      1090893
      32.8
      
      138018
      4.2
      
      27.2
    
    
      45
      Vermont
      315067
      178573
      56.7
      3
      95369
      30.3
      
      10078
      3.2
      
      26.4
    
    
      20
      Maryland
      2781446
      1677928
      60.3
      10
      943169
      33.9
      
      79605
      2.9
      
      26.4
    
    
      32
      New York
      7721453
      4556124
      59.0
      29
      2819534
      36.5
      
      176598
      2.3
      
      22.5
    
    
      13
      Illinois
      5536424
      3090729
      55.8
      20
      2146015
      38.8
      
      209596
      3.8
      
      17.0
    
    
      47
      Washington
      3209214
      1742718
      54.3
      8*
      1221747
      38.1
      
      160879
      5.0
      
      16.2
    
    
      39
      Rhode Island
      464144
      252525
      54.4
      4
      180543
      38.9
      
      14746
      3.2
      
      15.5



In [30]:

    
df_state_votes.hill_trump_diff.hist(bins=np.arange(-100, 105, 5))









    Out[30]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f3299e94150>



In [31]:

    
(df_state_votes.hill_trump_diff > 0).value_counts()









    Out[31]:





False    30
True     21
Name: hill_trump_diff, dtype: int64



In [33]:

    
df_state_votes[df_state_votes.State.isin([
    "Alaska", "Alabama", "Iowa",
    "Delaware", "Vermont", "Hawaii"
])]









    Out[33]:







  
    
      
      State
      Total
      Hillary Votes
      Hillary Percent
      Hillary EV
      Trump Votes
      Trump Percent
      Trump EV
      Other Votes
      Other Percent
      Other EV
      hill_trump_diff
    
  
  
    
      0
      Alabama
      2123372
      729547
      34.4
      
      1318255
      62.1
      9
      44467
      2.1
      
      -27.7
    
    
      1
      Alaska
      318608
      116454
      36.6
      
      163387
      51.3
      3
      18725
      5.9
      
      -14.7
    
    
      7
      Delaware
      441590
      235603
      53.4
      3
      185127
      41.9
      
      14757
      3.3
      
      11.5
    
    
      11
      Hawaii
      428937
      266891
      62.2
      3*
      128847
      30.0
      
      15954
      3.7
      
      32.2
    
    
      15
      Iowa
      1566031
      653669
      41.7
      
      800983
      51.1
      6
      59186
      3.8
      
      -9.4
    
    
      45
      Vermont
      315067
      178573
      56.7
      3
      95369
      30.3
      
      10078
      3.2
      
      26.4



In [ ]:

Model:	Logit	Pseudo R-squared:	0.006
Dependent Variable:	url_type	AIC:	8527.2848
Date:	2018-01-25 21:09	BIC:	8549.2115
No. Observations:	11036	Log-Likelihood:	-4260.6
Df Model:	2	LL-Null:	-4287.5
Df Residuals:	11033	LLR p-value:	2.1182e-12
Converged:	1.0000	Scale:	1.0000
No. Iterations:	6.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
Intercept	-2.2820	0.4656	-4.9009	0.0000	-3.1946	-1.3694
state_color[T.r]	0.4146	0.0567	7.3075	0.0000	0.3034	0.5258
np.log10(population)	0.0262	0.0660	0.3974	0.6911	-0.1031	0.1556

	u_state	success	total	proportion	failure	population	std_err	log_odds
43	SD	5	10	0.500000	5	865454.0	0.158114	0.000000
0	AK	5	14	0.357143	9	741894.0	0.128060	-0.587787
1	AL	32	130	0.246154	98	4863300.0	0.037781	-1.119232
51	WV	4	20	0.200000	16	1831102.0	0.089443	-1.386294
50	WI	41	206	0.199029	165	5778708.0	0.027818	-1.392373
47	VA	61	330	0.184848	269	8411808.0	0.021368	-1.483837
13	IA	12	67	0.179104	55	3134693.0	0.046845	-1.522426
10	FL	126	707	0.178218	581	20612439.0	0.014393	-1.528469
45	TX	167	938	0.178038	771	27862596.0	0.012491	-1.529695
44	TN	29	165	0.175758	136	6651194.0	0.029631	-1.545359

	u_state	success	total	proportion	failure	population	std_err	log_odds
0	AK	8	41	0.195122	33	741894.0	0.061891	-1.417066
1	AL	33	219	0.150685	186	4863300.0	0.024174	-1.729239
21	ME	9	88	0.102273	79	1331479.0	0.032301	-2.172223
12	IA	12	139	0.086331	127	3134693.0	0.023822	-2.359280
32	NJ	43	500	0.086000	457	8944469.0	0.012538	-2.363483
27	MT	4	47	0.085106	43	1042520.0	0.040702	-2.374906
50	WI	20	257	0.077821	237	5778708.0	0.016711	-2.472328
51	WV	2	26	0.076923	24	1831102.0	0.052259	-2.484907
23	MN	15	200	0.075000	185	5519952.0	0.018625	-2.512306
9	FL	92	1252	0.073482	1160	20612439.0	0.007374	-2.534387

	u_state	success	total	proportion	failure	population	std_err	log_odds
32	NM	14	57	0.245614	43	2081015.0	0.057015	-1.122143
5	CO	51	232	0.219828	181	5540545.0	0.027189	-1.266671
33	NV	38	176	0.215909	138	2940058.0	0.031014	-1.289667
21	ME	14	74	0.189189	60	1331479.0	0.045529	-1.455287
49	WI	37	197	0.187817	160	5778708.0	0.027827	-1.464256
3	AZ	47	265	0.177358	218	6931071.0	0.023464	-1.534347
0	AK	3	17	0.176471	14	741894.0	0.092459	-1.540445
13	ID	7	40	0.175000	33	1683140.0	0.060078	-1.550597
6	CT	15	90	0.166667	75	3576452.0	0.039284	-1.609438
31	NJ	53	336	0.157738	283	8944469.0	0.019885	-1.675155

	u_state	success	total	proportion	failure	population	std_err	log_odds
8	DC	194	429	0.452214	235	681170.0	0.024030	-0.191727
2	AR	24	68	0.352941	44	2988248.0	0.057952	-0.606136
35	NY	394	1154	0.341421	760	19745289.0	0.013959	-0.656968
29	ND	2	6	0.333333	4	757952.0	0.192450	-0.693147
48	VT	7	21	0.333333	14	624594.0	0.102869	-0.693147
15	IL	132	421	0.313539	289	12801539.0	0.022611	-0.783625
12	HI	9	29	0.310345	20	1428557.0	0.085909	-0.798508
49	WA	122	394	0.309645	272	7288000.0	0.023293	-0.801781
6	CO	73	239	0.305439	166	5540545.0	0.029793	-0.821528
0	AK	4	14	0.285714	10	741894.0	0.120736	-0.916291

	u_state	success	total	proportion	failure	population	std_err	log_odds
8	DE	10	22	0.454545	12	952065.0	0.106159	-0.182322
48	VT	16	48	0.333333	32	624594.0	0.068041	-0.693147
28	NC	168	508	0.330709	340	10146788.0	0.020874	-0.704982
24	MO	62	189	0.328042	127	6093000.0	0.034151	-0.717053
11	HI	27	84	0.321429	57	1428557.0	0.050957	-0.747214
7	DC	574	1846	0.310943	1272	681170.0	0.010773	-0.795716
19	MA	184	607	0.303130	423	6811779.0	0.018655	-0.832436
31	NH	31	103	0.300971	72	1334795.0	0.045195	-0.842679
29	ND	3	10	0.300000	7	757952.0	0.144914	-0.847298
33	NM	18	61	0.295082	43	2081015.0	0.058395	-0.870828

	u_state	success	total	proportion	failure	population	std_err	log_odds
42	SD	4	10	0.400000	6	865454.0	0.154919	-0.405465
19	MA	154	447	0.344519	293	6811779.0	0.022477	-0.643220
32	NM	17	57	0.298246	40	2081015.0	0.060596	-0.855666
0	AK	5	17	0.294118	12	741894.0	0.110510	-0.875469
20	MD	64	230	0.278261	166	6016447.0	0.029550	-0.953105
12	IA	20	72	0.277778	52	3134693.0	0.052786	-0.955511
7	DC	106	385	0.275325	279	681170.0	0.022765	-0.967773
40	RI	11	40	0.275000	29	1056426.0	0.070600	-0.969401
34	NY	456	1660	0.274699	1204	19745289.0	0.010956	-0.970912
29	NE	10	37	0.270270	27	1907116.0	0.073009	-0.993252

	State	Total	Hillary Votes	Hillary Percent	Hillary EV	Trump Votes	Trump Percent	Trump EV	Other Votes	Other Percent	hill_trump_diff
0	Alabama	2123372	729547	34.4		1318255	62.1	9	44467	2.1	-27.7
1	Alaska	318608	116454	36.6		163387	51.3	3	18725	5.9	-14.7
2	Arizona	2573165	1161167	45.1		1252401	48.7	11	106327	4.1	-3.6
3	Arkansas	1130635	380494	33.7		684872	60.6	6	29829	2.6	-26.9
4	California	14181595	8753788	61.7	55	4483810	31.6		478500	3.4	30.1

	State	Total	Hillary Votes	Hillary Percent	Trump Votes	Trump Percent	Trump EV	Other Votes	Other Percent	Other EV	hill_trump_diff
50	Wyoming	255849	55973	21.9	174419	68.2	3	13287	5.2	None	-46.3
48	West Virginia	713051	188794	26.5	489371	68.6	5	23004	3.2		-42.1
36	Oklahoma	1452992	420375	28.9	949136	65.3	7	83481	5.7		-36.4
34	North Dakota	344360	93758	27.2	216794	63.0	3	21434	6.2		-35.8
12	Idaho	690255	189765	27.5	409055	59.3	4	28331	4.1		-31.8
41	South Dakota	370093	117458	31.7	227721	61.5	3	20850	5.6		-29.8
17	Kentucky	1924149	628854	32.7	1202971	62.5	8	53752	2.8		-29.8
0	Alabama	2123372	729547	34.4	1318255	62.1	9	44467	2.1		-27.7
3	Arkansas	1130635	380494	33.7	684872	60.6	6	29829	2.6		-26.9
42	Tennessee	2508027	870695	34.7	1522925	60.7	11	70397	2.8		-26.0

	State	Total	Hillary Votes	Hillary Percent	Hillary EV	Trump Votes	Trump Percent	Other Votes	Other Percent	hill_trump_diff
8	Dist. of Col.	311268	282830	90.9	3	12723	4.1	4906	1.6	86.8
11	Hawaii	428937	266891	62.2	3*	128847	30.0	15954	3.7	32.2
4	California	14181595	8753788	61.7	55	4483810	31.6	478500	3.4	30.1
21	Massachusetts	3325046	1995196	60.0	11	1090893	32.8	138018	4.2	27.2
45	Vermont	315067	178573	56.7	3	95369	30.3	10078	3.2	26.4
20	Maryland	2781446	1677928	60.3	10	943169	33.9	79605	2.9	26.4
32	New York	7721453	4556124	59.0	29	2819534	36.5	176598	2.3	22.5
13	Illinois	5536424	3090729	55.8	20	2146015	38.8	209596	3.8	17.0
47	Washington	3209214	1742718	54.3	8*	1221747	38.1	160879	5.0	16.2
39	Rhode Island	464144	252525	54.4	4	180543	38.9	14746	3.2	15.5